Warm Up

Import dt

Basic descriptives

## [1] "2021-11-07" "2022-05-30"
## [1] 40578
## # A tibble: 1 × 7
##     min   max  mean    q1 median    q3    sd
##   <dbl> <dbl> <dbl> <dbl>  <dbl> <dbl> <dbl>
## 1     1   713  201.    89   178.  292.  147.

Biggest profiles

Authors

Articles

Text analysis

##    SOURCE_TYPE       Avg       STD min max
## 1:   instagram 340.85211  26.48027 143 362
## 2:      reddit 274.77670 103.15160  20 356
## 3:     twitter 261.44314  89.66503  84 686
## 4:    facebook 150.19545  21.41469  20 172
## 5:         web  79.92884  30.07199   1 177
## 6:     youtube  56.72377  23.45628   9 133
## 7:       forum  39.34595  20.52029   4 127
## 8:     comment  22.00000   0.00000  22  22
##    SOURCE_TYPE       Avg        STD min   max
## 1:         web 4168.3613 5050.17995  86 33436
## 2:       forum 1411.3813 1490.12476  48  9651
## 3:     youtube 1031.4136  927.56866  31  5396
## 4:      reddit  909.1942 1630.09003  70 11632
## 5:   instagram  783.6479  455.22641 151  2269
## 6:    facebook  701.3283 1001.06685  20 32409
## 7:     comment  334.8745  301.80986  34  1556
## 8:     twitter  261.4431   89.66503  84   686
## [1] 40578    49
## [1] 22212336       49
## [1] 14003636       49

Basic freq

##                 word     N
##      1:     ukrajini 50545
##      2:     ukrajine 41509
##      3:        ruske 41269
##      4:     ukrajinu 34419
##      5:        ljudi 33801
##     ---                   
## 287472: nestašlucima     1
## 287473:    šaravanji     1
## 287474:        biško     1
## 287475:   puteševica     1
## 287476:    nekdašnja     1

Sentiment

## # A tibble: 1,145 × 2
##    FROM                                                                  negat…¹
##    <chr>                                                                   <dbl>
##  1 "RTL.hr Sport"                                                          16.7 
##  2 "Geopolitika.news"                                                      14.3 
##  3 "Prolaznik_Slučajni"                                                    11.1 
##  4 "Visokoin.com"                                                          11.1 
##  5 "Profitiraj.hr"                                                          8.70
##  6 "HRT OTVORENO"                                                           8.42
##  7 "Damir Vucić \U0001f1ed\U0001f1f7\U0001f1fa\U0001f1f8 \U0001f64f\U00…    7.69
##  8 "Fran Papac"                                                             7.69
##  9 "Ivan Kutlesa"                                                           7.69
## 10 "Ivana Cindrić"                                                          7.69
## # … with 1,135 more rows, and abbreviated variable name ¹​negativnostIndex
## # A tibble: 1,732 × 2
##    FROM                                                                pozitiv…¹
##    <chr>                                                                   <dbl>
##  1 "liburnija"                                                              22.2
##  2 "KAportal"                                                               18.8
##  3 "Mersed Hasana Trakic"                                                   16.7
##  4 "Vranjska Plus"                                                          16.7
##  5 "Požeška biskupija"                                                      14.7
##  6 "Glas Istre HR"                                                          14.3
##  7 "A.S.D. Settalese"                                                       13.3
##  8 "C60. \U0001f1ed\U0001f1f7\U0001f1fa\U0001f1e6\U0001f1ea\U0001f1fa"      13.3
##  9 "vinyldaewo"                                                             13.2
## 10 "Tihomir Mastelic-Ivic"                                                  12.8
## # … with 1,722 more rows, and abbreviated variable name ¹​pozitivnostIndex

Analysis of most liked posts

## [1] 1059   49
##                      FROM   N
##   1:             index.hr 158
##   2:          jutarnji.hr 131
##   3:            24sata.hr 102
##   4:           dnevnik.hr  58
##   5: slobodnadalmacija.hr  57
##  ---                         
## 162:         SDP Hrvatske   1
## 163:  dubrovackiportal.hr   1
## 164:         fightsite.hr   1
## 165:               hcl.hr   1
## 166:         Osječki taxi   1
##                      FROM  LIKES
##   1:            24sata.hr 260735
##   2:             index.hr 215720
##   3:          jutarnji.hr 167645
##   4: slobodnadalmacija.hr 102494
##   5:           dnevnik.hr  68541
##  ---                            
## 162:         SDP Hrvatske    523
## 163:  dubrovackiportal.hr    520
## 164:         fightsite.hr    517
## 165:               hcl.hr    514
## 166:         Osječki taxi    500
## [1] 788444     49
##             word    N
##     1:  ukrajini 1615
##     2:     ruske 1519
##     3:     ljudi 1275
##     4:  ukrajinu 1275
##     5:  ukrajine 1247
##    ---               
## 60216:  brodskih    1
## 60217: eskadrila    1
## 60218:  razarača    1
## 60219:   fregatu    1
## 60220: pomorskim    1

Term importance

## Share of words per domain
# Word counts per outlet for the four selected outlets.
# `%in%` is an exact, case-sensitive match, so the names must be spelled the
# way they occur in FROM ("24sata.hr", "index.hr", ...); the earlier spellings
# "24sata" and "Index.hr" matched nothing and dropped those outlets.
domenaWords <- fb_tokenTidy_TopLike %>%
  filter(
    FROM %in% c("24sata.hr", "jutarnji.hr", "slobodnadalmacija.hr", "index.hr")
  ) %>%
  count(FROM, word, sort = TRUE)

# Total number of counted word occurrences per outlet.
ukupnoWords <- domenaWords %>%
  group_by(FROM) %>%
  summarise(totWords = sum(n))
# FROM is the only shared column; naming it avoids the "Joining, by" message.
domenaWords <- left_join(domenaWords, ukupnoWords, by = "FROM")
# domenaWords %>% head(15)
# domenaWords %>% 
# ggplot(., aes(n/totWords, fill = domena)) +
#   geom_histogram(show.legend = FALSE) +
#   xlim(NA, 0.0009) +
#   facet_wrap(~domena, ncol = 2, scales = "free_y")
## Most important words per domain
# tf-idf of every word, treating each outlet (FROM) as one document.
idf <- domenaWords %>%
  bind_tf_idf(word, FROM, n)
#idf %>% head(10)
# idf %>% 
#   select(-totWords) %>%
#   arrange(desc(tf_idf))
# Plot the highest tf-idf words of each outlet, one facet per outlet.
idf %>%
  arrange(desc(tf_idf)) %>%
  # Fix the factor levels so the bars follow the descending tf-idf order.
  mutate(word = factor(word, levels = rev(unique(word)))) %>%
  mutate(FROM = factor(FROM)) %>%
  group_by(FROM) %>%
  # Rank explicitly by tf_idf instead of relying on "last column" defaulting.
  top_n(11, tf_idf) %>%
  ungroup() %>%
  ggplot(aes(word, tf_idf, fill = FROM)) +
  geom_col(show.legend = FALSE) +
  labs(x = NULL, y = "tf-idf") +
  facet_wrap(~FROM, ncol = 2, scales = "free") +
  coord_flip() +
  theme_economist()

Phrases

# Tokenise the full text of every post into two-word sequences (bigrams).
fb_bigram <- fb_TopLike %>%
  unnest_tokens(bigram, FULL_TEXT, token = "ngrams", n = 2)
#fb_bigram %>% head(10)
# fb_bigram %>%
#   count(bigram, sort = T) %>%
#   head(25) 
# Split each bigram into its two component words.
fb_bigram_sep <- fb_bigram %>%
  separate(bigram, c("word1", "word2"), sep = " ")

# Drop bigrams containing a stop word, then blank out (set to NA) any word that
# contains a digit or consists of a single ASCII letter.
fb_bigram_tidy <- fb_bigram_sep %>%
  filter(!word1 %in% stop_corpus$word) %>%
  filter(!word2 %in% stop_corpus$word) %>%
  mutate(
    word1 = replace(word1, grepl("\\d|^[a-zA-Z]$", word1), NA),
    word2 = replace(word2, grepl("\\d|^[a-zA-Z]$", word2), NA)
  )

# Frequency of every (word1, word2) pair; pairs with an NA member are kept here.
fb_bigram_tidy_bigram_counts <- fb_bigram_tidy %>%
  count(word1, word2, sort = TRUE)

# Re-join the two words, keeping only bigrams where both are present.
# Filtering on is.na() before unite() replaces the earlier approach of letting
# unite() produce the literal text "NA" and then removing rows by grepl("NA").
# NOTE(review): the two agree as long as no token itself contains upper-case
# "NA" (unnest_tokens() lower-cases by default) - confirm if that default changes.
bigrams_united <- fb_bigram_tidy %>%
  filter(!is.na(word1), !is.na(word2)) %>%
  unite(bigram, word1, word2, sep = " ")
#bigrams_united
# Bigram counts per outlet.
topicBigram <- bigrams_united %>%
  count(FROM, bigram, sort = TRUE)

# The 45 most frequent bigrams overall.
bigrams_united %>%
  count(bigram, sort = TRUE) %>%
  head(45)
##                      bigram   n
## 1               ruske snage 390
## 2          vanjskih poslova 337
## 3    ukrajinski predsjednik 241
## 4            ruske invazije 230
## 5            vladimir putin 217
## 6       ministarstvo obrane 190
## 7     predsjednik volodimir 190
## 8          pročitajte ovdje 168
## 9        volodimir zelenski 162
## 10             ruska vojska 153
## 11            milijuna kuna 134
## 12        ministar vanjskih 133
## 13        ruski predsjednik 132
## 14       rusko ministarstvo 127
## 15     predsjednik vladimir 126
## 16         ukrajinske snage 125
## 17       društvenim mrežama 124
## 18        ukrajinska vojska 121
## 19            nekoliko dana 119
## 20             ispod oglasa 112
## 21          nastavlja ispod 112
## 22       humanitarne pomoći 109
## 23              ruske trupe 109
## 24         vladimira putina 109
## 25            protiv rusije 103
## 26         ruske federacije 102
## 27            crvenog križa  99
## 28        humanitarnu pomoć  98
## 29           ruskih vojnika  95
## 30       humanitarnu akciju  93
## 31           europske unije  89
## 32    ministarstvo vanjskih  89
## 33            ratne zločine  89
## 34        sjedinjene države  89
## 35              godinu dana  88
## 36                svaki dan  88
## 37 ukrajinskog predsjednika  88
## 38             ruskih snaga  85
## 39      volodimir zelenskij  80
## 40           ruske agresije  79
## 41          ministar obrane  78
## 42            oružane snage  78
## 43              mjesec dana  77
## 44        novinska agencija  77
## 45      ukrajinski ministar  77
# Most important bigrams per domain
# tf-idf of every bigram, treating each outlet (FROM) as one document. It is
# computed over all outlets, so idf reflects the whole set of sources.
bigram_tf_idf <- bigrams_united %>%
#  filter (!is.na(bigram)) %>%
  count(FROM, bigram) %>%
  bind_tf_idf(bigram, FROM, n) %>%
  arrange(desc(tf_idf))

# Plot the highest tf-idf bigrams for the four selected outlets.
# `%in%` is an exact, case-sensitive match, so the names must be spelled the
# way they occur in FROM ("24sata.hr", "index.hr", ...); the earlier spellings
# "24sata" and "Index.hr" matched nothing and dropped those outlets.
bigram_tf_idf %>%
  filter(
    FROM %in% c("24sata.hr", "jutarnji.hr", "slobodnadalmacija.hr", "index.hr")
  ) %>%
  arrange(desc(tf_idf)) %>%
  # Fix the factor levels so the bars follow the descending tf-idf order.
  mutate(bigram = factor(bigram, levels = rev(unique(bigram)))) %>%
  group_by(FROM) %>%
  # Rank explicitly by tf_idf instead of relying on "last column" defaulting.
  top_n(20, tf_idf) %>%
  ungroup() %>%
  ggplot(aes(bigram, tf_idf, fill = FROM)) +
  geom_col(show.legend = FALSE) +
  labs(x = NULL, y = "tf-idf") +
  facet_wrap(~FROM, ncol = 2, scales = "free") +
  coord_flip() +
  theme_economist()

PHRASES CORRELATION

# Pairwise correlation of words across DATE values, restricted to words that
# occur more than 200 times in fb_tokenTidy.
corsWords <- fb_tokenTidy %>%
#  filter(datum > "2020-02-20") %>%
  group_by(word) %>%
  filter(n() > 200) %>%
  filter(!is.na(word)) %>%
  pairwise_cor(word, DATE, sort = TRUE)
#corsWords %>%
#  filter(item1 == "oporavak")
# For each probe word, plot the ten words most strongly correlated with it.
# NOTE(review): "kupnja", "akcija" and "poklon" look like leftovers from a
# different analysis; if none of them passes the n() > 200 filter above, the
# plot will be empty - confirm the intended probe words.
corsWords %>%
  filter(item1 %in% c("kupnja", "akcija", "poklon")) %>%
  group_by(item1) %>%
  # Rank explicitly by correlation instead of relying on "last column" defaulting.
  top_n(10, correlation) %>%
  ungroup() %>%
  mutate(item2 = reorder(item2, correlation)) %>%
  ggplot(aes(item2, correlation)) +
  geom_col() +
  facet_wrap(~ item1, scales = "free") +
  coord_flip() +
  theme_economist()

THEMATIC ANALYSIS

# Document-term matrix: one document per outlet (FROM), one term per word,
# cell values are the word counts.
dtm <- cast_dtm(
  count(fb_tokenTidy_TopLike, FROM, word, sort = TRUE),
  FROM, word, n
)

# Fit a four-topic LDA model; the seed is fixed in the control list.
fb_LDA <- LDA(dtm, k = 4, control = list(seed = 1234))

# Per-topic, per-term probabilities (the "beta" matrix) in tidy form.
fb_LDA_tidy <- fb_LDA %>%
  tidy(matrix = "beta")
#newsCOVID_LDA_tidy
# The 15 highest-beta terms of each topic, ordered by topic and descending beta.
insta_terms <- fb_LDA_tidy %>%
  drop_na() %>%
  group_by(topic) %>%
  top_n(15, beta) %>%
  ungroup() %>%
  arrange(topic, desc(beta))
#newsCOVID_terms
# One facet per topic; reorder_within()/scale_x_reordered() sort the terms
# by beta separately inside each facet.
ggplot(
  mutate(insta_terms, term = reorder_within(term, beta, topic)),
  aes(term, beta, fill = factor(topic))
) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") +
  coord_flip() +
  scale_x_reordered() +
  theme_economist()